ASER Pakistan 2016

In this piece of paper, a set of data obtained from Annual Status of Education Report (ASER) is explored. The raw data was downloaded from the link here. https://palnetwork.org/aser-centre/

Preparation

Packages Used

library(tidyverse)
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(gghighlight)
library(stringr)
library(dplyr)
library(sf)
library(scatterplot3d)
library(car)
# library(broom)
require(ggiraph)
## Warning: package 'ggiraph' was built under R version 4.0.3
require(ggiraphExtra)

# require(plyr)

Data Installation

provdist <- read.csv("aser/ASER2016ProvDist.csv")
school <- read.csv("aser/ASER2016GSchool.csv")
child <- read.csv("aser/ASER2016Child.csv")
pschool <- read.csv("aser/ASER2016PvtSchool.csv")
gschool <- read.csv("aser/ASER2016GSchool.csv")
parent <- read.csv("aser/ASER2016Parent.csv")
house <- read.csv("aser/ASER2016HouseholdIndicators.csv")
RegionName <- c("2" = "Panjab", 
                "3" = "Sindh", 
                "4" = "Balochistan", 
                "5" = "Khyber Pakhtunkhwa", 
                "6" = "Gilgit-Baltistan", 
                "7" = "Azad Jammu and Kashmir", 
                "8" = "Islamabad - ICT", 
                "9" = "Federally Administrated Tribal Areas")
Gender <- c("0" = "Male",
            "-1" = "Female")

Exploration

Checking Samplesizes

length(unique(child$CID))
## [1] 255196

The whole samplesize (the numebr of children) of this dataset is 255196.

child %>% 
  filter(DID == 266) %>% 
  summarize(N_hunza = length(unique(CID)))

The samplesize of Hunza alone is 1641.

Exploration in Hunza

Gender Proportion

child %>% 
  filter(DID == 266) %>% 
  summarize(gender_proportion = mean(C002))

-1: female, 0: male
gender_proportion = -0.5173675 means there are a little more girls in the dataset.

Age of Children (C001)

child %>% 
  filter(DID == 266) %>% 
  ggplot(aes(C001)) +
  geom_histogram()

Age is well sparsed

Eduation Status

child %>% 
  filter(DID == 266) %>% 
  ggplot(aes(C003)) +
  geom_histogram(bins = 3)

1 = never enrolled; 2 = drop-out; 3 = currently enrolled

Education Status by Gender

child %>% 
  filter(DID == 266) %>% 
  ggplot(aes(C003)) +
  geom_histogram(bins = 3, binwidth = 1) +
  facet_grid(~C002, labeller = labeller(C002 = Gender))

Both genders look pretty good interms of the absolute number of currently-enrolled-children

The Enrollment Rate by Gender

child %>% 
  filter(DID == 266) %>% 
  group_by(C002) %>% 
  mutate(enrollment_rate = mean(C003 == 3)) %>% 
  ggplot(aes(C002, enrollment_rate)) +
  geom_col() +
  scale_y_continuous() +
  geom_label(aes(label = enrollment_rate)) +
  scale_x_continuous(breaks = c(-1, 0), labels = c("Female", "Male"))

As a rate, both are doing pretty good

Currently-Enrolled: Institution Type (C006) in Hunza

1 = Government; 2 = Private; 3 = Madrasah(Conventional religious education) School; 4 = Other(Non formal education facility)

child %>% 
  filter(DID == 266) %>% 
  ggplot(aes(C006)) +
  geom_histogram()

Most children go to public schools or private schools

Within Gilgit-Baltistan

child %>% 
  filter(RID == 6) %>% 
  group_by(DID) %>% 
  mutate(Current_Enrollment_Rate = mean(C003 == 3)) %>% 
  ggplot(aes(DID, Current_Enrollment_Rate)) +
  geom_count() +
  scale_x_continuous(breaks = 260:266, labels = c("Gilgit", "Diamer", "Skardu", "Ghanshe", "Astore", "Ghizer", "Hunza-Nagar"))

Within Gilgit-Baltistan, Hunza is outperforming.

Basic Learning Levels: Reading in Local/National Language (C010)

1 = Begginer/Nothing; 2 = Letters; 3 = Words; 4 = Sentences; 5 = Story

child %>% 
  filter(DID == 266) %>% 
  ggplot(aes(C010)) +
  geom_histogram()

child %>% 
  filter(DID == 266) %>% 
  summarize(na = sum(is.na(C010)))

Basic Learning Levels by Age

child %>% 
  filter(DID == 266, C013 != c(3,4)) %>% 
  ggplot(aes(C010)) +
  geom_histogram() +
  facet_grid(~C001)

# children at he age of 3 and 4 are removed for they have not data

Basic Learning Levels by Gender

child %>% 
  filter(DID == 266) %>% 
  ggplot(aes(C010)) +
  geom_histogram() +
  facet_grid(~C002, labeller = labeller(C002 = Gender))

English Reading Levels (C013)

child %>% 
  filter(DID == 266, C013 != c(3,4)) %>% 
  ggplot(aes(C013)) +
  geom_histogram() +
  facet_grid(~C001)

# children at he age of 3 and 4 are removed for they have not data

English Reading Levels by Gender

child %>% 
  filter(DID == 266) %>% 
  ggplot(aes(C013)) +
  geom_histogram() +
  facet_grid(~C002, labeller = labeller(C002 = Gender))

child %>% 
  filter(DID == 266) %>% 
  group_by(C002) %>% 
  summarize(enrollment_rate = mean(C003 == 3, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
child %>% 
  group_by(DID, C002) %>% 
  mutate(enrollment_rate = mean(C003 == 3, na.rm = TRUE)) %>% 
  ggplot(aes(DID, enrollment_rate, color = factor(RID), label = C002)) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(label = C002), show.legend = FALSE, nudge_y = 0.05) 
## Warning: Removed 66 rows containing missing values (geom_text).

Comparison between Other Region

Current Enrollment Rate

child %>% 
  group_by(DID) %>% 
  mutate(avg = round(mean(C003 == 3), digits = 2)) %>% 
  ungroup() %>% 
  ggplot(aes(avg)) +
  geom_histogram() +
  facet_grid(~RID, labeller = labeller(RID = RegionName)) +
  labs(title = "Current Enrollment Rate by Region")

Female Average Learning Levels (Age or other variables are NOT adjusted)

child %>% 
  filter(C002 == -1) %>% 
  group_by(DID) %>% 
  mutate(avg_learning = mean(C010, na.rm = TRUE)) %>% 
  ggplot(aes(DID, avg_learning, color = RID)) +
  geom_point() +
  geom_text(aes(label = DID), nudge_x = 5, check_overlap = TRUE)

child %>% 
  filter(C002 == -1) %>% 
  group_by(DID) %>% 
  mutate(avg_learning = mean(C010, na.rm = TRUE)) %>% 
  ggplot(aes(DID, avg_learning, color = RID)) +
  geom_point() +
  geom_text(aes(label = DID), nudge_x = 5, check_overlap = TRUE) +
  gghighlight(RID == 6)

It is interesting to note that Gilgit-Baltistan(RID==6) has a huge diversity in average learning levels of girls and Hunza(DID==266) is in the top group of all region.

Spatial Analysis

Districts Map

ica_df <- ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
  as.data.frame()
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
ica_df <- ica_df %>% select(Province, Districts, x, y)
ica_df <- ica_df %>% summarize(Province = tolower(Province), Districts = tolower(Districts), x = x, y = y)

Child and ProvDist data combination

child_dname <- child %>% left_join(provdist[-1])
## Joining, by = "DID"
child_dname <- child_dname %>% mutate(dname = tolower(DNAME))
ica_df_3 <- ica_df %>% filter(Province == "sindh")

ica_df_3$Districts <- ica_df_3$Districts %>%  
  str_replace("ghotki", "gotki") %>%
  str_replace("mirpur khas", "mirpurkhas") %>% 
  str_replace("malir karachi", "karachi-malir-rural") %>% 
  str_replace("naushahro feroze", "nowshero feroze") %>% 
  str_replace("kambar shahdad kot", "qambar shahdadkot") %>% 
  str_replace("sujawal", "sajawal") %>% 
  str_replace("shaheed benazir abad", "shaheed benazirabad") %>% 
  str_replace("tando allahyar", "tando allah yar") %>% 
  as.vector()

child_dname_3 <- child_dname %>% filter(RNAME == "Sindh") %>% left_join(ica_df_3, by = c("dname" = "Districts"))

child_dname_3 %>% group_by(dname) %>% summarize(n = sum(x))
## `summarise()` ungrouping output (override with `.groups` argument)
ica_df_3

District IDs

ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
    ggplot() +
  geom_sf() +
  geom_point(data = child_ica, aes(x, y, label = DID, color = factor(RID))) +
  geom_text(data = child_ica, aes(x, y, label = DID), size = 5, nudge_y = 0.2, check_overlap = TRUE)
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 8541 rows containing missing values (geom_point).
## Warning: Removed 8541 rows containing missing values (geom_text).

Gilgit-Baltistan Region

ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
    ggplot() +
  geom_sf() +
  geom_point(data = child_ica, aes(x, y, label = DNAME, shape = factor(RID), color = factor(RID))) +
  geom_text(data = child_ica, aes(x, y, label = DNAME), size = 4.5, nudge_y = 0.2, check_overlap = TRUE) +
  scale_shape_manual(values = 0:8) +
  coord_sf(xlim = c(71, 77.9), ylim = c(34, 37.1))
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 8541 rows containing missing values (geom_point).
## Warning: Removed 8541 rows containing missing values (geom_text).

Latitudes and longitudes of Gilgit-Baltistan region

child_ica %>% 
  filter(RID == 6) %>% 
  summarise(xmin = min(x), xmax = max(x), ymin = min(y), ymax = max(y))

Average enrollment rate

ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
    ggplot() +
  geom_sf() +
  geom_point(data = child_ica %>% group_by(DID) %>% 
               mutate(avg_enroll = mean(C003 == 3)), 
             aes(x, y, label = avg_enroll, color = avg_enroll))

  # geom_text(data = child_ica%>% group_by(DID) %>% 
  #             mutate(avg_enroll = mean(C003 == 3)), 
  #           aes(x, y, avg_enroll), check_overlap = TRUE, nudge_y = 0.5)

Average enrollment rate of female children

ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
    ggplot() +
  geom_sf() +
  geom_point(data = child_ica %>% filter(C002 == -1) %>% group_by(DID) %>% 
               mutate(avg_enroll = mean(C003 == 3)), 
             aes(x, y, label = avg_enroll, color = avg_enroll))
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 3566 rows containing missing values (geom_point).

ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
    ggplot() +
  geom_sf() +
  geom_point(data = child_ica, aes(x, y, label = C010, color = C010))

  # geom_text(data = child_ica, aes(x, y, label = C010), check_overlap = TRUE, nudge_y = 1)

Traial: children, gender

ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
    ggplot() +
  geom_sf() +
  geom_point(data = child_ica %>% 
               group_by(DID) %>%
               mutate(gender_ratio = mean(C002)), 
             aes(x, y, label = gender_ratio, color = gender_ratio))

0: male, -1: female

Thus, -0.5 indicates the complete gender parity

ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
    ggplot() +
  geom_sf() +
  geom_point(data = child_ica %>% group_by(DID) %>% 
  mutate(gender_ratio = mean(C002, na.rm = TRUE)), aes(x, y, label = gender_ratio, color = gender_ratio, size = -gender_ratio))

ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
    ggplot() +
  geom_sf() +
  geom_point(data = (child_ica %>% filter(C001 == 14:16) %>% group_by(DID) %>% 
               mutate(avg_learning_level = mean(C010, na.rm = TRUE))), aes(x, y, label = avg_learning_level, color = avg_learning_level, size = avg_learning_level))

ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
    ggplot() +
  geom_sf() +
  geom_point(data = (child_ica %>% filter(C001 == 12:16) %>% group_by(DID) %>%
                       mutate(gender_ratio = mean(C002, na.rm = TRUE), avg_learning_level = mean(C010, na.rm = TRUE))), aes(x, y, label = gender_ratio, color = gender_ratio, size = avg_learning_level)) +
  theme(axis.title = element_text())

Average Local/National Language Level Across Ages by Rgion

child_ica %>% 
  filter(C002 == c(0,-1)) %>% 
  group_by(DID, C001, C002) %>% 
  mutate(avg_local = mean(C010, na.rm = TRUE)) %>% 
  ggplot(aes(C001, avg_local, color = factor(C002), group = DID)) +
  geom_line(show.legend = FALSE) +
  labs(x = "Age") +
  facet_grid(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
  facet_wrap(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender))

Average Enrollment Rate Across Ages by Rgion

child_ica %>% 
  filter(C002 == c(0,-1)) %>% 
  group_by(DID, C001, C002) %>% 
  mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>% 
  ggplot(aes(C001, avg_enrollment, color = factor(DID), group = DID)) +
  geom_line(show.legend = FALSE) +
  labs(x = "Age") +
  facet_grid(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
  facet_wrap(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender))

Hunza

child_ica %>% 
  filter(C002 != "NA", RID == 6, RID != "NA") %>% 
  group_by(DID, C001, C002) %>% 
  mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>% 
  ggplot(aes(C001, avg_enrollment, color = factor(DID), group = DID)) +
  geom_line() +
  labs(x = "Age") +
  facet_grid(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
  facet_wrap(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
  gghighlight(label_key = DNAME, calculate_per_facet = TRUE)

child_ica %>% 
  filter(C002 != "NA", RID == 6, RID != "NA") %>% 
  group_by(DID, C001, C002) %>% 
  mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>% 
  ggplot(aes(C001, avg_enrollment, color = factor(DID), group = DID)) +
  geom_line() +
  labs(x = "Age") +
  facet_grid(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
  facet_wrap(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
  gghighlight(DID == 266,label_key = DNAME, calculate_per_facet = TRUE)
## Warning: Tried to calculate with group_by(), but the calculation failed.
## Falling back to ungrouped filter operation...

Karachi

child_ica %>% 
  filter(C002 != "NA", RID == 3) %>% 
  group_by(DID, C001, C002) %>% 
  mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>% 
  ggplot(aes(C001, avg_enrollment, color = factor(DID), group = DID)) +
  geom_line() +
  labs(x = "Age") +
  facet_grid(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
  facet_wrap(RID~C002, labeller = labeller(RID = RegionName, C002 = Gender)) +
  gghighlight(DID == c(315,316), label_key = DNAME, calculate_per_facet = TRUE)

Child & Parents

child_ica %>%
  filter(RID == 6, PR004 != "NA") %>% 
  group_by(DID, PR004) %>% 
  mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>% 
  ggplot(aes(DID, avg_enrollment, fill = factor(PR004))) +
  geom_col(aes(factor(DID), avg_enrollment, fill = factor(PR004)), position = "dodge")

Within Gilgit-Baltistan, Hunza is the only district where there are more women who have experience of education than those who have never enrolled in schools.

child_ica %>%
  filter(RID == 6, PR009 != "NA") %>% 
  group_by(DID, PR009) %>% 
  mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>% 
  ggplot(aes(DID, avg_enrollment, group = PR009, fill = factor(PR009))) +
  geom_col(aes(factor(DID), avg_enrollment), position = "dodge")

child_ica %>% 
  filter(PR009 != "NA", PR004 != "NA") %>% 
  group_by(DID) %>% 
  mutate(avg_learning = mean(C003 == 3)) %>% 
  ggplot(aes(PR004, avg_learning, group = PR004)) +
  geom_boxplot(aes(PR004, avg_learning)) +
  facet_grid(.~RID)

child_ica %>% 
  filter(PR009 != "NA", PR004 != "NA") %>% 
  group_by(DID) %>% 
  mutate(avg_learning = mean(C003 == 3)) %>% 
  ggplot(aes(PR009, avg_learning, group = PR009)) +
  geom_boxplot(aes(PR009, avg_learning)) +
  facet_grid(.~RID)

child_ica %>% 
  filter(PR009 != "NA", PR004 != "NA") %>% 
  group_by(DID) %>% 
  mutate(avg_learning = mean(C003 == 3, na.rm = TRUE), mother_enrollment = mean(PR004, na.rm = TRUE)) %>% 
  ggplot(aes(mother_enrollment, avg_learning)) +
  geom_point()

child_ica %>% 
  filter(PR009 != "NA", PR004 != "NA") %>% 
  group_by(DID) %>% 
  mutate(avg_learning = mean(C003 == 3, na.rm = TRUE), mother_enrollment = mean(PR004, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(r = cor(mother_enrollment, avg_learning))
child_ica %>% 
  filter(PR009 != "NA", PR004 != "NA") %>% 
  group_by(DID) %>% 
  mutate(avg_learning = mean(C003 == 3, na.rm = TRUE), father_enrollment = mean(PR009, na.rm = TRUE)) %>% 
  ggplot(aes(father_enrollment, avg_learning)) +
  geom_point()

child_ica %>% 
  filter(PR009 != "NA", PR004 != "NA") %>% 
  group_by(DID) %>% 
  mutate(avg_learning = mean(C003 == 3), father_enrollment = mean(PR009)) %>% 
  ungroup() %>% 
  summarize(r = cor(father_enrollment, avg_learning))
child_ica %>% 
  filter(PR009 != "NA", PR004 != "NA", C003 != "NA", DID == 266, C001 != "NA") %>% 
  group_by(PR004, C001) %>% 
  mutate(avg_learning = mean(C010, na.rm = TRUE)) %>% 
  ggplot(aes(C001, avg_learning, group = PR004, color = factor(PR004))) +
  geom_line(aes(C001, avg_learning))
## Warning: Removed 200 row(s) containing missing values (geom_path).

child_ica %>% 
  filter(PR009 != "NA", PR004 != "NA", C003 != "NA", DID == 266, C001 != "NA") %>% 
  group_by(PR009, C001) %>% 
  mutate(avg_learning = mean(C010, na.rm = TRUE)) %>% 
  ggplot(aes(C001, avg_learning, group = PR009, color = factor(PR009))) +
  geom_line(aes(C001, avg_learning))
## Warning: Removed 200 row(s) containing missing values (geom_path).

child_ica %>% 
  filter(PR009 != "NA", 
         PR004 != "NA", 
         C003 != "NA", 
         C001 != "NA",
         RID == 6) %>% 
  group_by(DID, PR004, C001) %>% 
  mutate(avg_learning = mean(C010, na.rm = TRUE)) %>% 
  ggplot(aes(C001, avg_learning, group = PR004, color = factor(PR004))) +
  geom_line(aes(C001, avg_learning)) +
  facet_grid(.~DID) +
  facet_wrap(.~DID)
## Warning: Removed 226 row(s) containing missing values (geom_path).

child_ica %>% 
  filter(PR009 != "NA", 
         PR004 != "NA", 
         C003 != "NA", 
         C001 != "NA",
         RID == 6) %>% 
  group_by(DID, PR009, C001) %>% 
  mutate(avg_learning = mean(C010, na.rm = TRUE)) %>% 
  ggplot(aes(C001, avg_learning, group = PR009, color = factor(PR009))) +
  geom_line(aes(C001, avg_learning)) +
  facet_grid(.~DID) +
  facet_wrap(.~DID)
## Warning: Removed 226 row(s) containing missing values (geom_path).

child_ica %>% 
  filter(PR009 != "NA", 
         PR004 != "NA", 
         C003 != "NA", 
         C001 != "NA",
         RID == 6) %>% 
  group_by(DID, PR004, C001) %>% 
  mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>% 
  ggplot(aes(C001, avg_enrollment, group = PR004, color = factor(PR004))) +
  geom_line(aes(C001, avg_enrollment)) +
  facet_grid(.~DID) +
  facet_wrap(.~DID)

child_ica %>%
  filter(PR009 != "NA", 
         PR004 != "NA", 
         C003 != "NA", 
         C001 != "NA",
         RID == 6) %>% 
  group_by(DID, PR009, C001) %>% 
  mutate(avg_enrollment = mean(C003 == 3, na.rm = TRUE)) %>% 
  ggplot(aes(C001, avg_enrollment, group = PR009, color = factor(PR009))) +
  geom_line(aes(C001, avg_enrollment)) +
  facet_grid(.~DID) +
  facet_wrap(.~DID)

child_ica %>%
  group_by(DID, PR006) %>% 
  ggplot(aes(PR006)) +
  geom_histogram(aes(PR006), stat = "count") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 5)) +
  facet_grid(.~RID)
## Warning: Ignoring unknown parameters: binwidth, bins, pad

Child, Household

child_ica %>% left_join(house, by = "HHID") %>% 
  filter(DID.x == 266) %>% 
  ggplot(aes(H002.x)) +
  geom_histogram(aes(H002.x), stat = "count")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

child_ica %>% 
  ggplot(aes(x = factor(DID), y = factor(H002), fill = factor(H002))) +
  geom_bar(aes(x = factor(DID), y = factor(H002)), stat = "identity", position = "fill") +
  facet_grid(.~RID, scales = "free") +
  facet_wrap(.~RID, scales = "free") +
  theme(axis.text.x = element_text(angle = 90, size = 7, hjust = 1)) +
  coord_cartesian(ylim = c(0,1), clip = "off", expand = FALSE)

child_ica %>% 
  filter(C003 != "NA", RID == 6) %>% 
  group_by(DID, H002) %>% 
  mutate(avg_enrollment = mean(C003 == 3)) %>% 
  ggplot(aes(factor(DID), avg_enrollment, group = H002, fill = factor(H002))) +
  geom_col(position = "dodge")

child_ica %>% 
  filter(PR004 != "NA", PR009 != "NA", C010 != "NA") %>% 
  group_by(DID) %>% 
  mutate(mother_edu_rate = sum(PR004 == -1)/length(PR004)) %>% 
  ggplot(aes(DID, mother_edu_rate, shape = factor(RID), color = factor(RID))) +
  geom_point() +
  scale_shape_manual(values = 0:8)

Average mother education experience

child_ica %>% 
  filter(PR004 != "NA", PR009 != "NA", C010 != "NA") %>% 
  group_by(DID) %>% 
  mutate(mother_edu_rate = sum(PR004 == -1)/length(PR004)) %>% ungroup() %>% 
  ggplot(aes(factor(RID), mother_edu_rate, group = RID)) +
  geom_violin()

How much are mother and father educated?

child_ica %>% 
  filter(PR004 != "NA", PR009 != "NA", C010 != "NA", C002 != "NA") %>% 
  ggplot(aes(PR005)) +
  geom_histogram(stat = "count") +
  coord_flip()
## Warning: Ignoring unknown parameters: binwidth, bins, pad

child_ica %>% 
  ggplot(aes(PR006)) +
  geom_histogram(stat = "count") +
  coord_flip()
## Warning: Ignoring unknown parameters: binwidth, bins, pad

child_ica %>% 
  filter(PR004 != "NA", PR009 != "NA", C010 != "NA", C002 != "NA") %>%
  ggplot(aes(PR010)) +
  geom_histogram(stat = "count") +
  coord_flip()
## Warning: Ignoring unknown parameters: binwidth, bins, pad

child_ica %>% 
  ggplot(aes(PR011)) +
  geom_histogram(stat = "count") +
  coord_flip()
## Warning: Ignoring unknown parameters: binwidth, bins, pad

Pak, children education stage

child_ica %>% 
  filter(PR004 != "NA", PR009 != "NA", C010 != "NA", C002 != "NA") %>% 
  group_by(C005) %>% 
  summarize(n = n()) %>%
  mutate(C005 = fct_reorder(C005, n)) %>% 
  ggplot(aes(C005, n)) +
  geom_col() +
  coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)

Hunza, children education stage

child_ica %>% 
  filter(PR004 != "NA", PR009 != "NA", !is.na(C010), C002 != "NA", !is.na(C005), DID == 266) %>% 
  group_by(C005) %>% 
  summarize(n = n()) %>%
  mutate(C005 = fct_reorder(C005, n)) %>% 
  ggplot(aes(C005, n)) +
  geom_col() +
  coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)

Create dummy variale with regard to region

child_ica_dummy <- child_ica
child_ica_dummy$Panjab <- ifelse(child_ica_dummy$RID == 2, 1, 0)
child_ica_dummy$Sindh <- ifelse(child_ica_dummy$RID == 3, 1, 0)
child_ica_dummy$Balochistan <- ifelse(child_ica_dummy$RID == 4, 1, 0)
child_ica_dummy$Khyber_Pakhtunkhwa <- ifelse(child_ica_dummy$RID == 5, 1, 0)
child_ica_dummy$Gilgit_Baltistan <- ifelse(child_ica_dummy$RID == 6, 1, 0)
child_ica_dummy$Azad_Jammu_and_Kashmir <- ifelse(child_ica_dummy$RID == 7, 1, 0)
child_ica_dummy$Islamabad_ICT <- ifelse(child_ica_dummy$RID == 8, 1, 0)
child_ica_dummy$Federally_Administrated_Tribal_Areas <- ifelse(child_ica_dummy$RID == 9, 1, 0)

Number of Children in a household

child_ica %>%
  filter(!is.na(PR004), !is.na(PR009)) %>% 
  filter(HHID == 96546) %>% 
  select(CID, PRID, C002, C001, PR001, VID, C003, DID, RID)
child_ica %>% 
  group_by(HHID) %>%
  mutate(n = n()) %>% 
  ggplot(aes(factor(n), fill = factor(RID))) +
  geom_bar(position = "dodge") +
  facet_grid(.~RID) +
  facet_wrap(.~RID)

Enrollment and Gender within Gilgit-Baltistan

child_ica %>% 
  filter(!is.na(C002), RID == 6) %>% 
  group_by(DID, C001, C002) %>% 
  mutate(enroll = mean(C003 == 3)) %>% 
  ggplot(aes(C001, enroll, group = C002, color = factor(C002))) +
  geom_line() +
  facet_wrap(.~DID)

Logistic Regression Analysis

Making Dataframe With Dummy Variables

child_ica_dummy <- child_ica_dummy %>% filter(!is.na(C002), !is.na(C003), !is.na(PR004), !is.na(PR009))
child_ica_dummy$C002_01 <- ifelse(child_ica_dummy$C002 == -1, 1, 0)
child_ica_dummy$C003_01 <- ifelse(child_ica_dummy$C003 == 3, 1, 0)
child_ica_dummy$PR004_01 <- ifelse(child_ica_dummy$PR004 == -1, 1, 0)
child_ica_dummy$PR009_01 <- ifelse(child_ica_dummy$PR009 == -1, 1, 0)
child_ica_dummy$PR004_PR009_01 <- ifelse(child_ica_dummy$PR004 == -1 | child_ica_dummy$PR009 == -1, 1, 0)

n_children_in_household <- child_ica_dummy %>% 
  group_by(HHID) %>% 
  summarize(n_children_in_household = length(unique(CID)))
## `summarise()` ungrouping output (override with `.groups` argument)
child_ica_dummy <- child_ica_dummy %>% left_join(n_children_in_household)
## Joining, by = "HHID"

Eliminated NAs

child_ica %>% 
  summarize(C002_na = sum(is.na(C002)),
            C003_na = sum(is.na(C003)),
            PR004_na = sum(is.na(PR004)),
            PR009_na = sum(is.na(PR009)))

Eliminated Rows in Total

data.frame(original_rows = nrow(child_ica),
           eliminated_rows = nrow(child_ica) - nrow(child_ica_dummy),
           ratio = (nrow(child_ica)-nrow(child_ica_dummy))/nrow(child_ica))

Eliminated NAs Hunza

child_ica %>% 
  filter(DID == 266) %>% 
  summarize(C002_na = sum(is.na(C002)),
            C003_na = sum(is.na(C003)),
            PR004_na = sum(is.na(PR004)),
            PR009_na = sum(is.na(PR009)))

Eliminated Rows in Total Hunza

data.frame(original_rows = nrow(child_ica %>% filter(DID == 266)),
           eliminated_rows = nrow(child_ica %>% filter(DID == 266)) - nrow(child_ica_dummy %>% filter(DID == 266)),
           ratio = (nrow(child_ica %>% filter(DID == 266) %>% filter(DID == 266))-nrow(child_ica_dummy %>% filter(DID == 266)))/nrow(child_ica %>% filter(DID == 266)))

GLM C003_01 ~ n_children_in_household + PR004_PR009_01 + C002_01

glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_01 + C002_01, family = "binomial", data = child_ica_dummy)

ggPredict(glm_child, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE)
## 
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_01 + 
##     C002_01, family = "binomial", data = child_ica_dummy)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8850  -1.2383   0.6563   0.8621   1.3071  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              0.934846   0.013780   67.84   <2e-16 ***
## n_children_in_household -0.055206   0.002935  -18.81   <2e-16 ***
## PR004_PR009_01           0.711645   0.009062   78.53   <2e-16 ***
## C002_01                 -0.572083   0.009041  -63.28   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 299726  on 245746  degrees of freedom
## Residual deviance: 289072  on 245743  degrees of freedom
## AIC: 289080
## 
## Number of Fisher Scoring iterations: 4
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

within hunza

# glm_child_hunza <- glm(C003_01 ~ C001 + C002_01 + PR004_01, family = "binomial", data = child_ica_dummy %>% filter(DID == 266))
glm_child_hunza <- glm(C003_01 ~ n_children_in_household + PR004_PR009_01 + C002_01, family = "binomial", data = child_ica_dummy %>% filter(DID == 266))

ggPredict(glm_child_hunza, se = TRUE, show.summary = TRUE, point = TRUE, colorAsFactor = TRUE)
## 
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_01 + 
##     C002_01, family = "binomial", data = child_ica_dummy %>% 
##     filter(DID == 266))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5255   0.3335   0.3611   0.4126   0.6056  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              2.74787    0.35406   7.761 8.43e-15 ***
## n_children_in_household -0.16354    0.07071  -2.313   0.0207 *  
## PR004_PR009_01           0.44052    0.21451   2.054   0.0400 *  
## C002_01                  0.12227    0.19347   0.632   0.5274    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 838.79  on 1609  degrees of freedom
## Residual deviance: 826.06  on 1606  degrees of freedom
## AIC: 834.06
## 
## Number of Fisher Scoring iterations: 5
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
In Hunza, gender difference does not provide much difference on child enrollment. instead, parents education has a huge impact.
ggPredict(glm_child_hunza, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE) +
  coord_cartesian(xlim = c(3,5), ylim = c(0.55, 0.9))
## 
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_01 + 
##     C002_01, family = "binomial", data = child_ica_dummy %>% 
##     filter(DID == 266))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5255   0.3335   0.3611   0.4126   0.6056  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              2.74787    0.35406   7.761 8.43e-15 ***
## n_children_in_household -0.16354    0.07071  -2.313   0.0207 *  
## PR004_PR009_01           0.44052    0.21451   2.054   0.0400 *  
## C002_01                  0.12227    0.19347   0.632   0.5274    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 838.79  on 1609  degrees of freedom
## Residual deviance: 826.06  on 1606  degrees of freedom
## AIC: 834.06
## 
## Number of Fisher Scoring iterations: 5
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

exponential transformation

exp(glm_child_hunza$coefficients)
##             (Intercept) n_children_in_household          PR004_PR009_01 
##              15.6093953               0.8491308               1.5535098 
##                 C002_01 
##               1.1300567

confidence interval (intercept and coefficient)

confint(glm_child_hunza, level = 0.95)
## Waiting for profiling to be done...
##                              2.5 %      97.5 %
## (Intercept)              2.0690843  3.45821705
## n_children_in_household -0.3018419 -0.02433219
## PR004_PR009_01           0.0115443  0.85445608
## C002_01                 -0.2575389  0.50262655

exponential transformation of confidence interval (odds ratio of intercept and coefficient)

exp(confint(glm_child_hunza, level = 0.95))
## Waiting for profiling to be done...
##                             2.5 %     97.5 %
## (Intercept)             7.9175694 31.7602989
## n_children_in_household 0.7394550  0.9759614
## PR004_PR009_01          1.0116112  2.3500958
## C002_01                 0.7729515  1.6530574

AIC

extractAIC(glm_child_hunza)
## [1]   4.0000 834.0574

BIC

extractAIC(glm_child_hunza, k = log(nrow(glm_child_hunza$data)))
## [1]   4.0000 855.5934

effectiveness of explanatory variables

glm_child_hunza_null <- glm(C003_01~1, family = "binomial", data = child_ica_dummy %>% filter(DID == 266))
anova <- anova(glm_child_hunza_null, glm_child_hunza, test = "Chisq")
anova

variables selection

step(glm_child_hunza_null, direction = "both", 
     scope = (~ C001 + C002_01 + PR004_01 + PR009_01))
## Start:  AIC=840.79
## C003_01 ~ 1
## 
##            Df Deviance    AIC
## + C001      1   732.49 736.49
## + PR004_01  1   824.25 828.25
## + PR009_01  1   830.82 834.82
## <none>          838.79 840.79
## + C002_01   1   838.58 842.58
## 
## Step:  AIC=736.49
## C003_01 ~ C001
## 
##            Df Deviance    AIC
## + PR004_01  1   707.11 713.11
## + PR009_01  1   717.26 723.26
## <none>          732.49 736.49
## + C002_01   1   732.49 738.49
## - C001      1   838.79 840.79
## 
## Step:  AIC=713.11
## C003_01 ~ C001 + PR004_01
## 
##            Df Deviance    AIC
## <none>          707.11 713.11
## + PR009_01  1   705.74 713.74
## + C002_01   1   707.11 715.11
## - PR004_01  1   732.49 736.49
## - C001      1   824.25 828.25
## 
## Call:  glm(formula = C003_01 ~ C001 + PR004_01, family = "binomial", 
##     data = child_ica_dummy %>% filter(DID == 266))
## 
## Coefficients:
## (Intercept)         C001     PR004_01  
##     -0.4789       0.3039       1.0316  
## 
## Degrees of Freedom: 1609 Total (i.e. Null);  1607 Residual
## Null Deviance:       838.8 
## Residual Deviance: 707.1     AIC: 713.1
# step(glm_child_null, direction = "both", 
#      scope = (~ C001 + C002 + PR004_01 + PR009_01 + 
#                 Panjab + Sindh + Balochistan + Khyber_Pakhtunkhwa + Gilgit_Baltistan + 
#                 Azad_Jammu_and_Kashmir + Islamabad_ICT))

multicolinearity

vif(glm_child_hunza)
## n_children_in_household          PR004_PR009_01                 C002_01 
##                1.087196                1.080729                1.006453
glm_child_hunza <- glm(C003_01 ~ C001 + C002_01 + PR009_01, family = "binomial", data = child_ica_dummy %>% filter(DID == 266))

ggPredict(glm_child_hunza, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE)
## 
## Call:
## glm(formula = C003_01 ~ C001 + C002_01 + PR009_01, family = "binomial", 
##     data = child_ica_dummy %>% filter(DID == 266))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2344   0.1622   0.2536   0.4405   0.9815  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.417574   0.296711  -1.407    0.159    
## C001         0.300175   0.032679   9.186  < 2e-16 ***
## C002_01     -0.002948   0.201334  -0.015    0.988    
## PR009_01     0.839955   0.210274   3.995 6.48e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 838.79  on 1609  degrees of freedom
## Residual deviance: 717.26  on 1606  degrees of freedom
## AIC: 725.26
## 
## Number of Fisher Scoring iterations: 6
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

ggPredict(glm_child_hunza, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE) +
  coord_cartesian(xlim = c(3,5), ylim = c(0.55, 0.9))
## 
## Call:
## glm(formula = C003_01 ~ C001 + C002_01 + PR009_01, family = "binomial", 
##     data = child_ica_dummy %>% filter(DID == 266))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2344   0.1622   0.2536   0.4405   0.9815  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.417574   0.296711  -1.407    0.159    
## C001         0.300175   0.032679   9.186  < 2e-16 ***
## C002_01     -0.002948   0.201334  -0.015    0.988    
## PR009_01     0.839955   0.210274   3.995 6.48e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 838.79  on 1609  degrees of freedom
## Residual deviance: 717.26  on 1606  degrees of freedom
## AIC: 725.26
## 
## Number of Fisher Scoring iterations: 6
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

glm_child_hunza <- glm(C003_01 ~ C001 + C002_01 + PR004_01, family = "binomial", data = child_ica_dummy %>% filter(DID == 266))

ggPredict(glm_child_hunza, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE)
## 
## Call:
## glm(formula = C003_01 ~ C001 + C002_01 + PR004_01, family = "binomial", 
##     data = child_ica_dummy %>% filter(DID == 266))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2906   0.1572   0.2702   0.4206   1.0017  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.48302    0.28454  -1.698   0.0896 .  
## C001         0.30382    0.03258   9.326  < 2e-16 ***
## C002_01      0.00938    0.20272   0.046   0.9631    
## PR004_01     1.03155    0.20425   5.051 4.41e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 838.79  on 1609  degrees of freedom
## Residual deviance: 707.11  on 1606  degrees of freedom
## AIC: 715.11
## 
## Number of Fisher Scoring iterations: 6
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

multi-colinearity?

child_ica_dummy %>% 
  filter(DID == 266) %>% 
  group_by(HHID) %>% 
  summarize(avg = length(unique(PRID)))
## `summarise()` ungrouping output (override with `.groups` argument)
child_ica_dummy %>% 
  filter(DID == 266) %>% 
  group_by(CID) %>% 
  summarize(avg = (PR004_01 + PR009_01)/2) %>% 
  ggplot(aes(factor(avg))) +
  geom_histogram(stat = "count") 
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Ignoring unknown parameters: binwidth, bins, pad

Districts in Gilgit-Baltistan

child_ica_dummy %>% filter(RID == 6) %>% summarize(unique(DID))
sapply(260:266, function(dist){
  glm_child_dist <- glm(C003_01 ~ C002_01, family = "binomial", data = child_ica_dummy %>% filter(RID == 6, DID == dist))
  glm_child_dist_null <- glm(C003_01~1, family = "binomial", data = child_ica_dummy %>% filter(RID == 6, DID == dist))
anova <- anova(glm_child_dist_null, glm_child_dist, test = "Chisq")
data.frame(result = anova$`Pr(>Chi)`[2], DID = dist, chi_0.05 = anova$`Pr(>Chi)`[2] >= 0.05)
})
##          [,1]       [,2]          [,3]       [,4]       [,5]      [,6]     
## result   0.02323511 1.711351e-125 0.02518717 0.04226537 0.1507538 0.7348869
## DID      260        261           262        263        264       265      
## chi_0.05 FALSE      FALSE         FALSE      FALSE      TRUE      TRUE     
##          [,7]     
## result   0.6463022
## DID      266      
## chi_0.05 TRUE
Within Gilgit-Baltistan, 3 of 7 districts including Hunza has Chi > 0.05

Logistic Regression and Anova for All Districts

id <- child_ica_dummy %>% summarize(unique(DID))
id <- as.matrix(id)
id <- as.numeric(id[,1])

dist_logist <- sapply(id, function(dist){
  
  glm_child_dist <- glm(C003_01 ~ C002_01, 
                        family = "binomial", 
                        data = child_ica_dummy %>% 
                          filter(DID == dist))
  
  glm_child_dist_null <- glm(C003_01~1, 
                             family = "binomial", 
                             data = child_ica_dummy %>% 
                               filter(DID == dist))

  anova <- anova(glm_child_dist_null, glm_child_dist, test = "Chisq")

data.frame(DID = dist, 
           chi = anova$`Pr(>Chi)`[2], 
           chi_largerThan_0.05 = anova$`Pr(>Chi)`[2] >= 0.05, 
           chi_largerThan_0.1 = anova$`Pr(>Chi)`[2] >= 0.1)
})

anova_dist <- as.data.frame(as.tibble(t(dist_logist)))
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
anova_dist$DID <- as.numeric(anova_dist$DID)
anova_dist$chi <- as.numeric(anova_dist$chi)
anova_dist$chi_largerThan_0.1 <- as.logical(anova_dist$chi_largerThan_0.1)
anova_dist$chi_largerThan_0.05 <- as.logical(anova_dist$chi_largerThan_0.05)
anova_dist %>% 
  summarize(total_chi_largerThan_0.05 = sum(as.numeric(chi_largerThan_0.05)),
            ratio_0.05 = sum(as.numeric(chi_largerThan_0.05))/length(chi_largerThan_0.05), 
            total_chi_largerThan_0.1 = sum(as.numeric(chi_largerThan_0.1)),
            ratio_0.1 = sum(as.numeric(chi_largerThan_0.1))/length(chi_largerThan_0.1))

DIDs with chi >= 0.05

anova_dist %>% 
  filter(chi_largerThan_0.05 == TRUE) %>% 
  select(-chi_largerThan_0.1)

DID chi > 0.05

DID_chi_0.05 <- anova_dist %>% 
  filter(chi_largerThan_0.05 == TRUE) %>% 
  select(DID) %>% 
  pull()
DID_chi_0.05
##  [1] 146 151 156 158 159 162 163 164 167 171 173 176 178 179 189 196 245 257 264
## [20] 265 266 267 268 269 270 271 272 273 274 276

DIDs with chi >= 0.1

anova_dist %>% 
  filter(chi_largerThan_0.1 == TRUE) %>% 
  select(-chi_largerThan_0.05)

DID chi > 0.1

DID_chi_0.1 <- anova_dist %>% 
  filter(chi_largerThan_0.1 == TRUE) %>% 
  select(DID) %>% 
  pull()
child %>% 
  filter(C002 == -1) %>% 
  group_by(DID) %>% 
  mutate(avg_learning = mean(C010, na.rm = TRUE)) %>% 
  ggplot(aes(DID, avg_learning)) +
  geom_point(data = child %>% 
               filter(C002 == -1 & DID == DID_chi_0.05) %>% 
               group_by(DID) %>%
               mutate(avg_learning = mean(C010, na.rm = TRUE)), size = 4, color = "red") +
  geom_point(aes(color = RID)) +
  geom_text(aes(label = DID), nudge_x = 5, check_overlap = TRUE)
## Warning in DID == DID_chi_0.05: 長いオブジェクトの長さが短いオブジェクトの長さの
## 倍数になっていません

child %>% 
  filter(DID == DID_chi_0.1) %>%
  group_by(DID, C001) %>% 
  mutate(enroll = mean(C003 == 3)) %>% 
  ggplot(aes(C001, enroll)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(DID~.)
## Warning in DID == DID_chi_0.1: 長いオブジェクトの長さが短いオブジェクトの長さの
## 倍数になっていません
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

education stages

child_ica_dummy %>% 
  group_by(DID) %>% 
  mutate(n = length(C003),
            dropout = sum(C003 == 2),
            dropout_ratio = mean(C003 == 2),
            never = sum(C003 == 1),
            never_ratio = mean(C003 == 1)) %>% 
  ggplot(aes(DID, never_ratio, color = factor(RID))) +
  geom_point()

Average Number of Children in a household

child_ica %>% 
  group_by(HHID) %>% 
  mutate(n_children = length(unique(CID))) %>% 
  ungroup() %>% 
  group_by(DID) %>% 
  mutate(avg_n_children = mean(n_children)) %>% 
  ggplot(aes(DID, avg_n_children, color = factor(RID))) +
  geom_point()

child_ica %>% 
  ggplot(aes(C005)) +
  geom_histogram(stat = "count") +
  coord_flip()
## Warning: Ignoring unknown parameters: binwidth, bins, pad

unique(child_ica$C005)
##  [1] "1"       "2"       ""        "4"       "3"       "Nursery" "6"      
##  [8] "7"       "5"       "8"       "Kachi"   "10"      "9"       "KG"     
## [15] "PG"      "Pakki"   "11"      "Prep"    "ECE"     "12"      "Hafiz"

fit the model

glm_child <- glm(C003_01~ C001 + C002_01 + PR004_01, family = "binomial", data = child_ica_dummy)

# glm_child <- glm(enroll01~ C001 + C002 + PR004 + PR009 +
#                    Panjab + Sindh + Balochistan + Khyber_Pakhtunkhwa + 
#                    Gilgit_Baltistan + Azad_Jammu_and_Kashmir + Islamabad_ICT, 
#                  family = "binomial", data = child_ica_dummy %>% filter(!is.na(C002)))
summary(glm_child)
## 
## Call:
## glm(formula = C003_01 ~ C001 + C002_01 + PR004_01, family = "binomial", 
##     data = child_ica_dummy)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4716  -1.0691   0.6141   0.8525   1.4442  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.567479   0.012767  -44.45   <2e-16 ***
## C001         0.174066   0.001340  129.90   <2e-16 ***
## C002_01     -0.562952   0.009369  -60.09   <2e-16 ***
## PR004_01     0.788430   0.010620   74.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 299726  on 245746  degrees of freedom
## Residual deviance: 271929  on 245743  degrees of freedom
## AIC: 271937
## 
## Number of Fisher Scoring iterations: 4

exponential transformation

exp(glm_child$coefficients)
## (Intercept)        C001     C002_01    PR004_01 
##   0.5669529   1.1901343   0.5695255   2.1999390

confidence interval (intercept and coefficient)

confint(glm_child, level = 0.95)
## Waiting for profiling to be done...
##                  2.5 %     97.5 %
## (Intercept) -0.5925094 -0.5424624
## C001         0.1714426  0.1766953
## C002_01     -0.5813180 -0.5445920
## PR004_01     0.7676346  0.8092649

exponential transformation of confidence interval (odds ratio of intercept and coefficient)

exp(confint(glm_child, level = 0.95))
## Waiting for profiling to be done...
##                 2.5 %    97.5 %
## (Intercept) 0.5529380 0.5813151
## C001        1.1870160 1.1932675
## C002_01     0.5591609 0.5800784
## PR004_01    2.1546637 2.2462561

AIC

extractAIC(glm_child)
## [1]      4.0 271937.4

BIC

extractAIC(glm_child, k = log(nrow(glm_child$data)))
## [1]      4.0 271979.1

effectiveness of explanatory variables

glm_child_null <- glm(C003_01~1, family = "binomial", data = child_ica_dummy)
anova(glm_child_null, glm_child, test = "Chisq")

variables selection

step(glm_child_null, direction = "both", 
     scope = (~ C001 + C002_01 + PR004_01 + PR009_01))
## Start:  AIC=299727.8
## C003_01 ~ 1
## 
##            Df Deviance    AIC
## + C001      1   281139 281143
## + PR009_01  1   293719 293723
## + PR004_01  1   295038 295042
## + C002_01   1   295943 295947
## <none>          299726 299728
## 
## Step:  AIC=281142.5
## C003_01 ~ C001
## 
##            Df Deviance    AIC
## + PR009_01  1   274422 274428
## + PR004_01  1   275560 275566
## + C002_01   1   277803 277809
## <none>          281139 281143
## - C001      1   299726 299728
## 
## Step:  AIC=274428
## C003_01 ~ C001 + PR009_01
## 
##            Df Deviance    AIC
## + C002_01   1   270765 270773
## + PR004_01  1   272824 272832
## <none>          274422 274428
## - PR009_01  1   281139 281143
## - C001      1   293719 293723
## 
## Step:  AIC=270773.3
## C003_01 ~ C001 + PR009_01 + C002_01
## 
##            Df Deviance    AIC
## + PR004_01  1   269072 269082
## <none>          270765 270773
## - C002_01   1   274422 274428
## - PR009_01  1   277803 277809
## - C001      1   289624 289630
## 
## Step:  AIC=269081.6
## C003_01 ~ C001 + PR009_01 + C002_01 + PR004_01
## 
##            Df Deviance    AIC
## <none>          269072 269082
## - PR004_01  1   270765 270773
## - PR009_01  1   271929 271937
## - C002_01   1   272824 272832
## - C001      1   288269 288277
## 
## Call:  glm(formula = C003_01 ~ C001 + PR009_01 + C002_01 + PR004_01, 
##     family = "binomial", data = child_ica_dummy)
## 
## Coefficients:
## (Intercept)         C001     PR009_01      C002_01     PR004_01  
##     -0.7815       0.1760       0.5677      -0.5762       0.4923  
## 
## Degrees of Freedom: 245746 Total (i.e. Null);  245742 Residual
## Null Deviance:       299700 
## Residual Deviance: 269100    AIC: 269100
# step(glm_child_null, direction = "both", 
#      scope = (~ C001 + C002 + PR004_01 + PR009_01 + 
#                 Panjab + Sindh + Balochistan + Khyber_Pakhtunkhwa + Gilgit_Baltistan + 
#                 Azad_Jammu_and_Kashmir + Islamabad_ICT))

multicolinearity

vif(glm_child)
##     C001  C002_01 PR004_01 
## 1.010465 1.004067 1.013630

Hunza

child_ica %>% 
  filter(DID == 266, PR009 == 0, !is.na(C008a)) %>% 
  summarize(n = length(unique(CID)),
            paid_tuition = sum(C008a == -1), 
            ratio = paid_tuition/n)
child_ica %>% 
  filter(DID == 266, PR004 == 0, !is.na(C008a)) %>% 
  summarize(n = length(unique(CID)),
            paid_tuition = sum(C008a == -1), 
            ratio = paid_tuition/n)
child_ica %>% 
  filter(DID == 266, !is.na(PR009)) %>% 
  summarize(father_edu_ratio = sum(PR009 == -1)/length(PR009))
child_ica %>% 
  filter(!is.na(PR009)) %>% 
  summarize(father_edu_ratio = sum(PR009 == -1)/length(PR009))
child_ica %>% 
  filter(!is.na(PR009)) %>% 
  group_by(DID) %>% 
  mutate(father_edu_ratio = sum(PR009 == -1)/length(PR009)) %>% 
  ggplot(aes(factor(DID), father_edu_ratio)) +
  geom_point()

child_ica %>% 
  filter(!is.na(PR009), !is.na(PR004)) %>% 
  group_by(DID) %>% 
  mutate(mother_edu_ratio = sum(PR004 == -1)/length(PR004)) %>% 
  ggplot(aes(factor(DID), mother_edu_ratio)) +
  geom_point()

child_ica %>% 
  group_by(DID) %>% 
  mutate(avg_paid = mean(C008a == -1, na.rm = TRUE)) %>% 
  ggplot(aes(DID, avg_paid)) +
  geom_point()+
  scale_y_continuous(limits = c(0,1))

child_ica %>% 
  filter(!is.na(C002)) %>%
  group_by(DID) %>% 
  summarize(avg_paid = mean(C008a == -1),
         female = mean(child_ica %>% filter(C002 == -1) %>% 
                         group_by(DID) %>% .$C003 == 3),
         male = mean(child_ica %>% filter(C002 == 0) %>% 
                       group_by(DID) %>% .$C003 == 3),
         ratio = female/male)
## `summarise()` ungrouping output (override with `.groups` argument)
child_ica %>% 
  filter(!is.na(C002)) %>%
  group_by(DID, C002) %>% 
  summarize(enroll = mean(C003 == 3)) %>% 
  ungroup() %>% 
  spread(C002, enroll) %>% 
  summarize(DID = DID,
            gender_ratio = .$"-1"/.$"0") %>% 
  ggplot(aes(DID, gender_ratio)) +
  geom_point() +
  scale_y_continuous(limits = c(0,1))
## `summarise()` regrouping output by 'DID' (override with `.groups` argument)
## Warning: Removed 16 rows containing missing values (geom_point).

correlation b/w paid-tuition and enrollment?

child_ica %>% 
  filter(!is.na(C002)) %>%
  group_by(DID) %>% 
  mutate(paid_tuition = mean(C008a == -1, na.rm = TRUE)) %>% 
  group_by(DID, C002) %>% 
  mutate(enroll = mean(C003 == 3)) %>% 
  ggplot(aes(paid_tuition, enroll)) +
  geom_point() +
  geom_smooth() +
  facet_grid(.~C002)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'